import numpy as np
import pandas as pd
import seaborn as sns
sns.set_palette('husl')
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
# Load the Iris dataset from a local CSV into a DataFrame.
data = pd.read_csv('Iris.csv')
# Preview the first five rows to sanity-check the load.
data.head()
| Id | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
|---|---|---|---|---|---|---|
| 0 | 1 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
| 1 | 2 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
| 2 | 3 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
| 3 | 4 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
| 4 | 5 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
# Column dtypes, non-null counts, and memory usage.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 150 entries, 0 to 149 Data columns (total 6 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Id 150 non-null int64 1 SepalLengthCm 150 non-null float64 2 SepalWidthCm 150 non-null float64 3 PetalLengthCm 150 non-null float64 4 PetalWidthCm 150 non-null float64 5 Species 150 non-null object dtypes: float64(4), int64(1), object(1) memory usage: 7.2+ KB
# Check each column for missing values (output shows all False: no NaNs).
data.isnull().any()
Id False SepalLengthCm False SepalWidthCm False PetalLengthCm False PetalWidthCm False Species False dtype: bool
# Per-column dtypes: Id is int64, four measurements are float64, Species is object.
data.dtypes
Id int64 SepalLengthCm float64 SepalWidthCm float64 PetalLengthCm float64 PetalWidthCm float64 Species object dtype: object
# (rows, columns) — 150 samples, 6 columns.
data.shape
(150, 6)
# Summary statistics (count/mean/std/quartiles) for the numeric columns.
data.describe()
| Id | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | |
|---|---|---|---|---|---|
| count | 150.000000 | 150.000000 | 150.000000 | 150.000000 | 150.000000 |
| mean | 75.500000 | 5.843333 | 3.054000 | 3.758667 | 1.198667 |
| std | 43.445368 | 0.828066 | 0.433594 | 1.764420 | 0.763161 |
| min | 1.000000 | 4.300000 | 2.000000 | 1.000000 | 0.100000 |
| 25% | 38.250000 | 5.100000 | 2.800000 | 1.600000 | 0.300000 |
| 50% | 75.500000 | 5.800000 | 3.000000 | 4.350000 | 1.300000 |
| 75% | 112.750000 | 6.400000 | 3.300000 | 5.100000 | 1.800000 |
| max | 150.000000 | 7.900000 | 4.400000 | 6.900000 | 2.500000 |
# Class balance: the dataset is perfectly balanced, 50 samples per species.
data['Species'].value_counts()
Iris-virginica 50 Iris-versicolor 50 Iris-setosa 50 Name: Species, dtype: int64
# Bar chart of how many samples each species has.
species_counts = sns.countplot(x=data['Species'])
species_counts.set_xlabel('Species')
species_counts.set_ylabel('Count')
species_counts.set_title('Bar Chart of Iris Species Counts')
plt.show()
# Pairwise scatter plots of the four measurements, colored by species
# (the Id column carries no information, so it is excluded).
tmp = data.drop(columns='Id')
g = sns.pairplot(tmp, hue='Species', markers='+')
plt.show()
# One violin plot (with quartile markers) per measurement, split by species.
# The four copy-pasted cells are collapsed into a single loop; the plots are
# produced in the same order as before.
for feature in ['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']:
    g = sns.violinplot(y='Species', x=feature, data=data, inner='quartile')
    plt.show()
# Feature matrix X: the four flower measurements; target y: the species label.
X = data.drop(columns=['Id', 'Species'])
y = data['Species']
print(X.shape)
print(y.shape)
(150, 4) (150,)
# Pie chart of the class distribution.
# Labels and sizes are derived from the data instead of being hard-coded,
# so the chart stays correct if the dataset changes.
fig = plt.figure()
ax = fig.add_axes([0, 0, 1, 1])
ax.axis('equal')  # equal aspect ratio keeps the pie circular
counts = data['Species'].value_counts()
# 'Iris-versicolor' -> 'Versicolor', etc., matching the original labels.
l = [name.replace('Iris-', '').capitalize() for name in counts.index]
s = counts.tolist()
ax.pie(s, labels=l, autopct='%1.2f%%')
plt.show()
# Sweep k = 1..25, fitting and scoring k-NN on the SAME data
# (training accuracy — optimistic; the honest held-out version comes later).
k_range = list(range(1, 26))
scores = []
for k in k_range:
    model = KNeighborsClassifier(n_neighbors=k)
    model.fit(X, y)
    scores.append(metrics.accuracy_score(y, model.predict(X)))
plt.plot(k_range, scores)
plt.xlabel('Value of k for KNN')
plt.ylabel('Accuracy Score')
plt.title('Accuracy Scores for Values of k of k-Nearest-Neighbors')
plt.show()
# Logistic regression, also scored on its own training data.
logreg = LogisticRegression()
logreg.fit(X, y)
print(metrics.accuracy_score(y, logreg.predict(X)))
0.9733333333333334
# Hold out 40% of the rows as a test set (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=5)
for part in (X_train, y_train, X_test, y_test):
    print(part.shape)
(90, 4) (90,) (60, 4) (60,)
# Repeat the k sweep, this time fitting on the training split and
# scoring on the held-out test split.
k_range = list(range(1, 26))
scores = [
    metrics.accuracy_score(
        y_test,
        KNeighborsClassifier(n_neighbors=k).fit(X_train, y_train).predict(X_test),
    )
    for k in k_range
]
plt.plot(k_range, scores)
plt.xlabel('Value of k for KNN')
plt.ylabel('Accuracy Score')
plt.title('Accuracy Scores for Values of k of k-Nearest-Neighbors')
plt.show()
# Logistic regression evaluated on the held-out test split.
logreg = LogisticRegression().fit(X_train, y_train)
y_pred = logreg.predict(X_test)
print(metrics.accuracy_score(y_test, y_pred))
0.9833333333333333
# Final k-NN model (k=12) fit on the full dataset.
knn = KNeighborsClassifier(n_neighbors=12)
knn.fit(X, y)
# Make a prediction for an out-of-sample observation.
# Pass a DataFrame with the same column names the model was fitted with:
# a bare list triggers sklearn's "X does not have valid feature names"
# warning (visible in the original output).
sample = pd.DataFrame([[6, 3, 4, 2]], columns=X.columns)
knn.predict(sample)
C:\Users\hp\anaconda3\lib\site-packages\sklearn\base.py:465: UserWarning: X does not have valid feature names, but KNeighborsClassifier was fitted with feature names
array(['Iris-versicolor'], dtype=object)
# Correlation heatmap of the numeric columns. numeric_only=True excludes the
# non-numeric 'Species' column — required in pandas >= 2.0, where
# DataFrame.corr() raises on object columns by default (older pandas silently
# dropped them, so this matches the original behavior).
sns.heatmap(data.corr(numeric_only=True), annot=True)
<AxesSubplot:>
from sklearn.tree import DecisionTreeClassifier
for feature in data.columns: # Loop through all columns in the dataframe
    if data[feature].dtype == 'object': # Only apply for columns with categorical strings
        data[feature] = pd.Categorical(data[feature]) # Convert to Categorical dtype (string labels are KEPT — use .cat.codes for integer codes)
data.head(10)
| Id | SepalLengthCm | SepalWidthCm | PetalLengthCm | PetalWidthCm | Species | |
|---|---|---|---|---|---|---|
| 0 | 1 | 5.1 | 3.5 | 1.4 | 0.2 | Iris-setosa |
| 1 | 2 | 4.9 | 3.0 | 1.4 | 0.2 | Iris-setosa |
| 2 | 3 | 4.7 | 3.2 | 1.3 | 0.2 | Iris-setosa |
| 3 | 4 | 4.6 | 3.1 | 1.5 | 0.2 | Iris-setosa |
| 4 | 5 | 5.0 | 3.6 | 1.4 | 0.2 | Iris-setosa |
| 5 | 6 | 5.4 | 3.9 | 1.7 | 0.4 | Iris-setosa |
| 6 | 7 | 4.6 | 3.4 | 1.4 | 0.3 | Iris-setosa |
| 7 | 8 | 5.0 | 3.4 | 1.5 | 0.2 | Iris-setosa |
| 8 | 9 | 4.4 | 2.9 | 1.4 | 0.2 | Iris-setosa |
| 9 | 10 | 4.9 | 3.1 | 1.5 | 0.1 | Iris-setosa |
# Fit a CART decision tree (Gini impurity, fixed seed) on the training split.
dTree = DecisionTreeClassifier(criterion='gini', random_state=1)
dTree.fit(X_train, y_train)
DecisionTreeClassifier(random_state=1)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
DecisionTreeClassifier(random_state=1)
# Accuracy on train vs. test — a perfect training score means the
# unpruned tree memorized the training data.
train_score = dTree.score(X_train, y_train)
test_score = dTree.score(X_test, y_test)
print('Training score:', train_score)
print('Test score:', test_score)
Training score: 1.0 Test score: 0.95
from sklearn.ensemble import RandomForestClassifier
# Random forest of 200 trees fit on the training split
# (fit() returns the estimator itself, so this binds the fitted model).
rfc = RandomForestClassifier(n_estimators=200).fit(X_train, y_train)
RandomForestClassifier(n_estimators=200)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(n_estimators=200)
# Training-set accuracy of the forest (expected to be ~1.0).
rfc_train = rfc.predict(X_train)
from sklearn import metrics
train_accuracy = metrics.accuracy_score(y_train, rfc_train)
print("Accuracy_Score =", format(train_accuracy))
Accuracy_Score = 1.0
from sklearn import metrics
# Held-out test-set accuracy of the forest.
predictions = rfc.predict(X_test)
test_accuracy = metrics.accuracy_score(y_test, predictions)
print("Accuracy_Score =", format(test_accuracy))
Accuracy_Score = 0.95
# Larger correlation heatmap with a sequential colormap.
# numeric_only=True excludes the non-numeric 'Species' column (pandas >= 2.0
# raises on object columns in DataFrame.corr() by default; older pandas
# silently dropped them, so results are unchanged).
plt.figure(figsize=(12, 10))
p = sns.heatmap(data.corr(numeric_only=True), annot=True, cmap='PuBuGn')
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score,mean_absolute_error
from sklearn.linear_model import LogisticRegression # for Logistic Regression algorithm
from sklearn.neighbors import KNeighborsClassifier # for K nearest neighbours
from sklearn import svm #for Support Vector Machine (SVM) Algorithm
from sklearn import metrics #for checking the model accuracy
from sklearn.tree import DecisionTreeClassifier #for using Decision Tree Algoithm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Support-vector classifier on the same train/test split.
svc_model = svm.SVC().fit(X_train, y_train)
y_pred_svc = svc_model.predict(X_test)
metrics.accuracy_score(y_test, y_pred_svc)
0.9833333333333333
# Logistic regression on the same train/test split.
lr_model = LogisticRegression().fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
metrics.accuracy_score(y_test, y_pred_lr)
0.9833333333333333
import numpy as np
import plotly.graph_objects as go
from sklearn import datasets
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
# Load sklearn's copy of the Iris dataset and keep only the first three
# features so the decision regions can be drawn in 3-D.
iris = datasets.load_iris()
X = iris.data[:, :3]
y = iris.target
# Standardize the features (zero mean, unit variance).
scaler = StandardScaler()
X_std = scaler.fit_transform(X)
# Fit a 3-nearest-neighbour classifier on the standardized features.
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_std, y)
# Build a 3-D grid at 0.1 spacing, padded by 1 unit beyond the data on
# every axis, for visualizing the decision regions.
mins = X_std.min(axis=0) - 1
maxs = X_std.max(axis=0) + 1
x_min, y_min, z_min = mins
x_max, y_max, z_max = maxs
xx, yy, zz = np.meshgrid(np.arange(x_min, x_max, 0.1),
                         np.arange(y_min, y_max, 0.1),
                         np.arange(z_min, z_max, 0.1))
grid_points = np.c_[xx.ravel(), yy.ravel(), zz.ravel()]
# Predicted class for every grid point -> colors the decision regions.
Z = knn.predict(grid_points)
# Interactive 3-D visualization of the k-NN decision regions with Plotly.
fig = go.Figure()
# Hover text for each observation: species name plus its three
# standardized feature values.
tooltip_text = [
    f'Species: {iris.target_names[cls]}'
    f'<br>Sepal Length: {x[0]:.2f}'
    f'<br>Sepal Width: {x[1]:.2f}'
    f'<br>Petal Length: {x[2]:.2f}'
    for cls, x in zip(y, X_std)
]
# Data points: opaque markers with per-point tooltips.
data_trace = go.Scatter3d(
    x=X_std[:, 0], y=X_std[:, 1], z=X_std[:, 2],
    mode='markers',
    marker=dict(size=5, color=y, colorscale='Viridis', opacity=0.8),
    name='Data Points',
    text=tooltip_text,
    hoverinfo='text',
)
# Decision regions: a cloud of nearly transparent grid points, no tooltips.
boundary_trace = go.Scatter3d(
    x=grid_points[:, 0], y=grid_points[:, 1], z=grid_points[:, 2],
    mode='markers',
    marker=dict(size=2, color=Z, colorscale='Viridis', opacity=0.03),
    name='Decision Boundaries',
    hoverinfo='none',
)
fig.add_trace(data_trace)
fig.add_trace(boundary_trace)
# Axis labels, black scene background, and no outer margin.
fig.update_layout(
    scene=dict(
        xaxis_title='Sepal Length (Standardized)',
        yaxis_title='Sepal Width (Standardized)',
        zaxis_title='Petal Length (Standardized)',
        bgcolor='black',
    ),
    margin=dict(l=0, r=0, b=0, t=0),
)
# Show the interactive plot.
fig.show()